home *** CD-ROM | disk | FTP | other *** search
- # Parsing The Timeline files of and finding keywords etc.
- # Written: Eli Zvuluny -
- # Possible Worlds
- #
- # First version - 17/1/99
- #
- #
- # The lex values is taken from the file lexItems.txt generated by access.
-
- # require 'StandardFile.pl';
-
- # First start to process all files in the current folder for an existance of
- # keywords (i.e. for a lexicon entries)
-
- use Cwd;
- use English;
-
-
- $keyFile = "lexItems.txt"; # file name generated by access, in the directory "workDir"
- open (ERRFILE, ">keyErr.lex");
- print "==== Processing the keywords (lexItems.txt) file =====\n";
- open (KEYFILE, "<$keyFile") or die "can't open file $keyFile\n";
- while (<KEYFILE>) {
- $current_line = $_;
- chomp;
- ($tmpNum, $tmpKey) = $current_line =~ /^(\d+),(.+)$/;
- $tmpKey =~ tr/a-z/A-Z/;
- # in case of multi words, extract extra blanks
- @tmpArr = split(/\s+/,$tmpKey);
- $tmpKey = join(" ",@tmpArr);
- # print "Before Keys $tmpKey\n";
- ($tmpKey, $parenKey) = $tmpKey =~ /^([^\(\)]+[^\s])\s*(\([^\(\)].*\))*\s*$/;
- # print "Keys $tmpKey, $parenKey\n";
- chop($tmpKey) if $tmpKey =~ /[\,\.\:]$/; # extract possible extra characters
- if ($lexItems{$tmpKey}) {
- print "*** The lexicon item: [$tmpKey] already exists\n";
- print ERRFILE "*** The lexicon item: [$tmpKey] already exists\n";
-
- } else {
- $lexItems{$tmpKey} = $tmpNum;
- $lexNums{$tmpNum} = $tmpKey; # and the inverted list
- }
- if ($parenKey) {
- $parenKey =~ s/\((.*)\)/\1/;
- chop($parenKey) if $parenKey =~ /[\,\.\:]$/; # extract possible extra characters
- if ($lexItems{$parenKey}) {
- print "*** The lexicon item: [$parenKey] already exists\n";
- print ERRFILE "*** The lexicon item: [$parenKey] already exists\n";
-
- } else {
- $lexItems{$parenKey} = $tmpNum;
- $lexNums{$tmpNum} = $tmpKey; # and the inverted list
- }
- }
- } # while KEYFILE
- close (KEYFILE);
- foreach $key (sort (keys (%lexItems))) {
- print "$key $lexItems{$key}\n";
- # check if there is a paired keywords separated with a ","
- if ($key =~ /^([^\,]+)\s*\,\s*([^\,\(\)]+)(\s*\(.*\))*$/) {
- # print "paired - $1 - $2 $key, $lexItems{$key}\n";
- $pairedItems{$1} = $2;
- $pairedItems{$2} = $1;
- $pairLexKey{$1 . " " . $2} = $lexItems{$key};
- $pairLexKey{$2 . " " . $1} = $lexItems{$key};
- # print "paired - $1 - $2 $key, $lexItems{$key}\n";
- if (!$lexItems{$1}) { # the If enforce the existance of a normal entry which = first part of pair
- $lexItems{$1} = $lexItems{$key}; # this will be the last choice for a pair element.
- # $lexNums{$idx-1} = $1;
- }
- $firstPaired{$lexItems{$key}} = $1; # used for a case of multipled paired entries (such as
- # (Kaplan, Moshe - Kaplan, Shimon), in those cases, on the second entry (Shimon), when
- # there is a simple reference to Kaplan (a lot), the system mark it by mistake since
- # it assumes that it refers to the first Kaplan, and not to the titled keyword.
- }
- else { #if it is a multiword, save the first word, and max number of words for that item.
- @tmpArr = split(/\s+/,$key);
- if ($#tmpArr > 1) {
- $tmpVal = $firstWord{$tmpArr[0]};
- if ($#tmpArr > $tmpVal) {
- $firstWord{$tmpArr[0]} = $#tmpArr+1;
- }
- # print " Multiword $Key, $tmpArr[0], $firstWord{$tmpArr[0]}\n";
- }
- } #if - elsif
- if (($beforeParen, $inParen) = $key =~ /^\s*([^\s]+)\s*\((.+)\s*\)$/) {
- chop($beforeParen) if $beforeParen =~ /[\,\.\:]$/; # extract possible extra characters
- chop($inParen) if $inParen =~ /[\,\.\:]$/; # extract possible extra characters
- if (!$lexItems{$inParen}) {
- $lexItems{$inParen} = $lexItems{$key};
- }
- if (!$lexItems{$beforeParen}) {
- $lexItems{$beforeParen} = $lexItems{$key};
- }
- } # there is another keyword in Parentheses
- } #foreach
- $dirname = "lexFiles";
- $currentDir = cwd;
- # print "$currentDir \n";
- # printKeyArr (\%pairLexKey);
- # now get the base dir, which is the parent directory, and the directory separator
- ($dirSep,$lastDir) = $currentDir =~ /([\\\:])([^\\\:]+)$/;
- # $baseDir = $`;
- $baseDir = $PREMATCH;
- @dirVec = ("TLtext");
- foreach $theDir (@dirVec) {
- $dirname = "$baseDir$dirSep" . "Text" . "$dirSep$theDir";
- mkdir ("${dirname}1", umask());
- opendir(CURDIR, $dirname) or die "can't open $dirname\n";
- chdir($dirname);
-
- $lastKey = "";
- $oneLinerName = "oneLiner.txt";
- open (ONELINER, ">${dirname}1$dirSep$oneLinerName");
- print "==== Start processing the Timeline files in $theDir =====\n";
- while ($curFile = readdir(CURDIR)) {
- if ($curFile =~ /tl(\d\d\d\d\d\d).doc/i) {
- $evDate = $1;
- print "-----$curFile----\n";
- $veryBigLine = "";
- open (FILEIN, "<$curFile") or die "can't open file\n";
- open (TXTFILE, ">${dirname}1$dirSep$curFile");
- $oneLineMode = 1;
- $firstPartLine = "";
- while (<FILEIN>) {
- $current_line = $_;
- if ($oneLineMode) {
- chomp $current_line;
- if ($current_line =~ /\@\@\@/) {
- $oneLineMode = 0;
- print ONELINER "$evDate\|$firstPartLine\n";
- } else {
- $firstPartLine .= $current_line;
- }
- } else {
- $veryBigLine = $veryBigLine . $current_line;
- } #if onlinemode
- } # while
- # extract key number from file name
- ($fileIdx) = $curFile =~ /txt(\d+)\.doc/i;
- # set the value of lastKey, which is the key associated with the file.
- $lastKey = $lexNums{$fileIdx};
- # print "Last key is $lastKey\n";
- if (!$lastKey) {
- $lastKey = "stamstamstam";
- }
- &closeKeyItem($lastKey); # handle last keyword
-
- close (FILEIN);
- } #if
- } # while
- } # foreach @dirVec
- close (KEYFILE);
- close (ERRFILE);
- close (ONELINER);
- print "==== Successful processing =====\n";
-
-
- sub closeKeyItem
- {
- local($curKey) = @_;
- if ($curKey eq "") {
- # print "Nothing to do now (really) !!!!!\n";
- } else {
- $veryBigLine =~ s/\n/ # /g;
- # now replace all the Bold and Italic with @b and @i.
- while (($HtmlTag,$innerString) = $veryBigLine =~ /<([IB])>([^[<>]*)<\/\1>/i) {
- # found, replace all the internal words with @i@ or @b@
- $beforeStr = $PREMATCH;
- $afterStr = $POSTMATCH;
- @markedWords = split(/\s+/,$innerString);
- # print " $HtmlTag,$innerString Number is $#markedWords\n";
- if ($#markedWords >= 0) {
- for ($i = 0; $i <= $#markedWords; $i++) {
- if ($markedWords [$i] !~ /^\s*$/) {
- # print "$markedWords[$i]\n";
- if ($markedWords [$i] ne "#") {
- $markedWords [$i] = '@' . $HtmlTag . '@' . $markedWords [$i];
- }
- }
- } # for
- } # if there is something between tag and its closure
- $veryBigLine = $beforeStr . " " . join(" ",@markedWords) . " " . $afterStr;
- } # while
- &handleLines($lastKey, $veryBigLine);
-
- $veryBigLine = join (" ",@splitWords);
- $veryBigLine =~ s/\s*\007\s*/\007/g;
- $veryBigLine =~ s/\007/\n/g;
- print TXTFILE $veryBigLine;
- $veryBigLine = "";
- close (TXTFILE);
- }
- } # closeKeyItem
-
- sub handleLines
- {
- local($curKey, $theLine) = @_;
- my $i, $tmpWord;
-
- # first replace the new line with another symbol
- $theLine =~ s/\n/\007/g;
- @splitWords = split(/\s+/,$theLine);
- # The algorithm for finding if a keyword already exist:
- # First of all naturally translate all words into uppercase characters.
- # The priorities for a keyword is:
- # 1. Key word that contains couple of words.
- # 2. Key words that compose of 2 parts weparated by ",". <part1>,<part2>
- # The possible combinations are: <part1> <part2> | <part2> <part1> | <part1>
- # 3. Single word that did not fulfill any of the previous criterias
- #
- # on All cases, we may try to use the canonic version of the word.
- # in all cases we do not look for the current file keyword.
- $prevWords = "";
- $numOfWords = 0;
- # print "split words $#splitWords\n";
- for ($i = 0; $i <= $#splitWords; $i++) {
- $tmpWord = $splitWords[$i];
- $tmpWord =~ tr/a-z/A-Z/;
- $tmpWord =~ s/^\@[IB]\@(.*)$/\1/i;
- # if ($tmpWord =~ /^<[\/]?b>/i) {
- # print "$tmpWord\n";
- # }
- # $tmpWord =~ s/^<[\/]?b>(.+)<[\/]?b>$/\1/i;
- # as said let's first check if the keyword is part of a multiword item
- # fast seach for firstword existance:
- if ($keyLength = $firstWord{$tmpWord}) {
- if (&checkManyWords($keyLength, $i)) {
- # print "Succesful match of multi words\n";
- $i += $keyLength-1; # skip all matched words ($i was already incremented by 1)
- }
- } # if there is a chance for a multiword key.
- elsif ($keyLength = &checkPairedWords($i)) { # now look of a paired keywords
- # print "Succesful match of paired words\n";
- $i += $keyLength-1; # skip all matched words
- } elsif ($keyLength = &checkOne($i)) {
- # print "Succesful match of Single words\n";
- # no need to skip all matched words
- } else {
- &checkPluralEtc($i);
- }
- # $scanKey = ($prevWords ne "") ? $prevWords . " " . $tmpWord : $tmpWord;
- # if ($lexItems{$scanKey} && $scanKey ne $curKey) {
- # print " Word found: $scanKey, in text -- $curKey\n";
- # @keyWord = split(/\s+/,$scanKey);
- # if ($#keyWord > 1) {
- # print " good results found it\n";
- # $prevWords = "";
- # $numOfWords = 0;
- # } # keywords has more than 1 word
- # } #found exactly as appreas in master keywords
- # elsif ($firstWord{$tmpWord} < 2) {
- # $prevWords = "";
- # $numOfWords = 0;
- # }
- # else {
- # $prevWords = $prevWords . " " . $tmpWord;
- # $numOfWords++;
- # print " a long prev word found --- $prevWords -- $numOfWords\n";
- #
- # } #
- } # for
- }
-
- sub checkManyWords
- {
- local ($numOfWords, $curIndex) = @_;
- my $tmpIdx, $tmpStr;
- my $i;
- $tmpIdx = $curIndex + $numOfWords;
- $tmpStr = $splitWords[$curIndex];
- for ($i = 1; $i < $numOfWords; $i++) {
- # concatenate the next $numOfWords into one keyword
- $tmpStr = $tmpStr . " " . $splitWords[$curIndex+$i];
- } # for
- $tmpStr =~ tr/a-z/A-Z/;
- $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig;
- chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/;
- chop($tmpStr) if $tmpStr =~ /\)$/;
- if ($tmpStr =~ /^\(/) {
- $tmpStr = substr ($tmpStr, 1);
- }
- if ($tmpStr =~ /^(.*)\'s$/i) {
- $tmpStr = $1;
- }
- $theIndex = $lexItems{$tmpStr};
- # print " Value of index $lexItems{$tmpStr}\n" if $theIndex;
- if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx))) {
- # for ($i = 0; $i < $numOfWords; $i++) {
- # $splitWords[$curIndex+$i] =~ s/^/\@$theIndex\@/;
- # print "$splitWords[$curIndex+$i] ";
- # This was the old method
- # $splitWords[$curIndex] =~ s/^/\<$theIndex\>/;
- # $splitWords[$curIndex + $numOfWords-1] =~ s/$/\<\/$theIndex\>/;
- # print "Added tag in checkManyWords - $splitWords[$curIndex]\n";
- for ($i = $curIndex; $i < $curIndex+$numOfWords ; $i++) {
- $splitWords[$i] =~ s/^/\@\+$theIndex\@/;
- }
- # } # for
- # print "\n";
-
- }
- $retVal = $lexItems{$tmpStr};
- } # checkManyWords
-
-
- sub checkPairedWords
- {
- local ($curIndex) = @_;
- my $tmpStr, $secondPair;
-
- $tmpStr = $splitWords[$curIndex];
- $tmpStr =~ tr/a-z/A-Z/;
- $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig;
- if ($secondPair = $pairedItems{$tmpStr}) {
- # print "=-=-=-= $secondPair = $splitWords[$curIndex] =-= $tmpStr\n";
- @tmpArr = split(/\s+/,$secondPair);
- $arrLen = $#tmpArr ? $#tmpArr : 1;
- if (&checkManyPair($arrLen+1, $curIndex)) {
- # print "!!!!Succesful match of First Paired words\n";
- $retVal = $arrLen+1;
- } else {
- $retVal = 0;
- }
- } else {
- $retVal = 0;
- }
- }
-
- sub checkManyPair
- {
- local ($numOfWords, $curIndex) = @_;
- my $tmpIdx, $tmpStr;
- my $i;
- $tmpIdx = $curIndex + $numOfWords;
- $tmpStr = $splitWords[$curIndex];
- for ($i = 1; $i < $numOfWords; $i++) {
- # concatenate the next $numOfWords into one keyword
- $tmpStr = $tmpStr . " " . $splitWords[$curIndex+$i];
- } # for
- # print " in CheckManyPair $tmpStr\n";
- $tmpStr =~ tr/a-z/A-Z/;
- $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig;
- # print " after CheckManyPair $tmpStr\n";
- chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/;
- chop($tmpStr) if $tmpStr =~ /\)$/;
- if ($tmpStr =~ /^\(/) {
- $tmpStr = substr ($tmpStr, 1);
- }
- if ($tmpStr =~ /^(.*)\'s$/i) {
- $tmpStr = $1;
- }
- $theIndex = $pairLexKey{$tmpStr};
- # print " after CheckManyPair the Index $theIndex,$tmpStr\n";
- # print " Value of index (in pair) $pairLexKey{$tmpStr}\n" if $theIndex;
- if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx))) {
- # for ($i = 0; $i < $numOfWords; $i++) {
- # $splitWords[$curIndex+$i] =~ s/^/\@$theIndex\@/;
- # This was the old method
- # $splitWords[$curIndex] =~ s/^/\<$theIndex\>/;
- # $splitWords[$curIndex + $numOfWords-1] =~ s/$/\<\/$theIndex\>/;
- for ($i = $curIndex; $i < $curIndex+$numOfWords ; $i++) {
- $splitWords[$i] =~ s/^/\@\+$theIndex\@/;
- }
- # print "Added tag in checkManyPairs - $splitWords[$curIndex]\n";
- # print "$splitWords[$curIndex+$i] ";
- # } # for
- # print "\n";
-
- }
- $retVal = $theIndex;
- } # checkManyPair
-
- sub checkOne
- {
- local ($curIndex) = @_;
- my $tmpStr;
-
- $tmpStr = $splitWords[$curIndex];
- $tmpStr =~ tr/a-z/A-Z/;
- $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig;
- chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/;
- chop($tmpStr) if $tmpStr =~ /\)$/;
- if ($tmpStr =~ /^\(/) {
- $tmpStr = substr ($tmpStr, 1);
- }
- if ($tmpStr =~ /^(.*)\'S$/) {
- $tmpStr = $1;
- }
-
- $theIndex = $lexItems{$tmpStr};
- # print " Value of index $lexItems{$tmpStr}\n" if $theIndex;
- # if ($theIndex && $firstPaired{$fileIdx} =~ /\b$tmpStr\b/) {
- # print " match first pair in file $fileIdx, entry $tmpStr - $firstPaired{$fileIdx}\n";
- # }
- if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx)) && ($firstPaired{$fileIdx} !~ /\b$tmpStr\b/)) {
- # $splitWords[$curIndex] =~ s/^/\@$theIndex\@/;
- # This was the old method
- # $splitWords[$curIndex] =~ s/^(.*)$/\<$theIndex\>\1\<\/$theIndex\>/;
- $splitWords[$curIndex] =~ s/^/\@\+$theIndex\@/;
- # print "Added tag in checkOne - $splitWords[$curIndex]\n";
- # print "$splitWords[$curIndex]\n ";
- $retVal = 1;
- } else {
- $retVal = 0;
- }
- } # checkOne
-
- sub checkPluralEtc
- {
- local ($curIndex) = @_;
- my $tmpStr;
-
- $tmpStr = $splitWords[$curIndex];
- $tmpStr =~ tr/a-z/A-Z/;
- $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig;
- chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/;
- chop($tmpStr) if $tmpStr =~ /\)$/;
- if ($tmpStr =~ /^\(/) {
- $tmpStr = substr ($tmpStr, 1);
- }
- if ($tmpStr =~ /^(.*)\'S$/) {
- $tmpStr = $1;
- } elsif ($tmpStr =~ /^(.*)IES$/) {
- $tmpStr = $1 . "Y";
- } elsif ($tmpStr =~ /^(.*)ES$/) {
- $tmpStr = $1;
- } elsif ($tmpStr =~ /S$/) { # regular plural
- chop($tmpStr);
- } else {
- return 0;
- }
-
-
- $theIndex = $lexItems{$tmpStr};
- # print " Value of index(Plural) $lexItems{$tmpStr}\n" if $theIndex;
- if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx)) && ($firstPaired{$fileIdx} !~ /\b$tmpStr\b/)) {
- # This was the old method
- # $splitWords[$curIndex] =~ s/^(.*)$/\<$theIndex\>\1\<\/$theIndex\>/;
- $splitWords[$curIndex] =~ s/^/\@\+$theIndex\@/;
- # print "Added tag in checkPlural Etc. - $splitWords[$curIndex]\n";
- # $splitWords[$curIndex] =~ s/^/\@$theIndex\@/;
- # print "(Plural) $splitWords[$curIndex]\n ";
- $retVal = 1;
- } else {
- $retVal = 0;
- }
- } # checkPluralEtc
-
- sub printKeyArr
- {
- local($arRef) = @_;
- foreach $key (keys (%$arRef)) {
- print " The Key $key and its val $$arRef{$key}\n";
- }
- }
-